The aim is to combine label transfer and CNV inference to annotate Wilms tumor samples in SCPCP000006. The proposed annotation will be based on the combination of:
the label transfer from the fetal kidney reference (Stewart et
al.), in particular the fetal_kidney_predicted.compartment
and fetal_kidney:predicted.cell_type, as well as the
prediction.score for each compartment,
the predicted CNV calculated using intra-sample endothelial and
immune cells (--reference both) as normal reference; no
reference was used for samples with fewer than 3 predicted normal
cells.
In a second time, we will explore and validate the chosen annotation.
We will use some of the markers genes to validate visually the annotations.
The analysis can be summarized as the following:
Where cnv.thr and pred.thr need to be
discussed
| first level annotation | second level annotation | selection of the cells | marker genes for validation | CNV validation |
|---|---|---|---|---|
| normal | endothelial | compartment == "endothelium" & predicted.score > pred.thr |
VWF |
no CNV |
| normal | immune | compartment == "immune" & predicted.score > pred.thr |
PTPRC, CD163, CD68 |
no CNV |
| normal | kidney | compartment == "fetal_nephron" & predicted.score > pred.thr & cnv_score < cnv.thr |
CDH1, PODXL, LTL |
no CNV |
| normal | stroma | compartment == "stroma" & predicted.score > pred.thr & cnv_score < cnv.thr |
VIM |
no CNV |
| cancer | stroma | compartment == "stroma" & cnv_score > cnv.thr |
VIM |
proportion_cnv_chr: 1, 4, 11, 16, 17, 18 |
| cancer | blastema | compartment == "fetal_nephron" & cell_type == "mesenchymal cell" & cnv_score > cnv.thr |
CITED1 |
proportion_cnv_chr: 1, 4, 11, 16, 17, 18 |
| cancer | epithelial | compartment == "fetal_nephron" & cell_type != "mesenchymal cell" & cnv_score > cnv.thr |
CDH1 |
proportion_cnv_chr: 1, 4, 11, 16, 17, 18 |
| unknown | - | the rest of the cells | - | - |
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current", "SCPCP000006")
# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")
result_dir <- file.path(module_base, "results")The report will be saved in the notebook directory. The
final annotation file SCPCP000006-annotations.tsv will be
saved in the results directory.
In this notebook, we are working with all of the samples in SCPCP000006.
The sample metadata can be found in sample_metadata_file
in the data folder.
We extracted from the pre-processed and labeled Seurat
object from
results/{sample_id}/06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds.
the following information per cell:
the predicted compartment in
fetal_kidney_predicted.compartment and
fetal_kidney_predicted.compartment.score
the predicted compartment in
fetal_kidney_predicted.cell_type and
fetal_kidney_predicted.cell_type.score
the sample identifier in sample_id
the CNV {dupli|loss} estimation per chromosome {i}
and arm {p|q}: has_{dupli|loss}_chr{i}{p|q},
in 1 to 22
the counts of markers
genes
We will also use:
the samples metadata or clinical data reported in /home/rstudio/data/current/SCPCP000006/single_cell_metadata.tsv
the cytoband file to get the length of each chromosome arm (for plotting only)
# sample metadata
sample_metadata_file <- file.path(repository_base, "data", "current", "SCPCP000006", "single_cell_metadata.tsv")
metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE)
# the cytoband file that will be used to extract the size of each chromosome arm for plotting only
arm_order_file <- file.path(module_base, "results", "references", "hg38_cytoBand.txt")# Extract from each Seurat object information that will be used
sample_ids <- metadata |>
dplyr::filter(seq_unit != "spot") |>
dplyr::pull(scpca_sample_id) |>
unique()
# These samples were run with "none" as the reference
none_reference_samples <- c("SCPCS000177", "SCPCS000180", "SCPCS000181", "SCPCS000190", "SCPCS000197")
# Create a data frames of all annotations
cell_type_df <- sample_ids |>
purrr::map(
# For each sample_id, do the following:
\(sample_id) {
if (sample_id %in% none_reference_samples) {
reference <- "none"
} else {
reference <- "both"
}
input_file <- file.path(
result_dir,
sample_id,
glue::glue("06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds")
)
# The file may not be present if this is being run in CI, which is ok.
# If we are not running in CI and the file doesn't exist, we should error out
# We should error out if the file does not exist and we are NOT testing
if (!file.exists(input_file)) {
if (params$testing) {
return(NULL)
} else {
stop("Input RDS file does not exist.")
}
}
# Read in the Seurat object
srat <- readRDS(input_file)
# Create and return a data frame from the Seurat object with relevant annotations
# this data frame will have four columns: barcode, sample_id, compartment, organ
data.frame(
# label transfer from the fetal kidney reference
cell_type = srat$fetal_kidney_predicted.cell_type,
compartment = srat$fetal_kidney_predicted.compartment,
# predicted.scores from the label transfer from the fetal kidney reference
cell_type.score = srat$fetal_kidney_predicted.cell_type.score,
compartment.score = srat$fetal_kidney_predicted.compartment.score,
# cell embedding
umap = srat@reductions$umap@cell.embeddings,
# marker genes
PTPRC = FetchData(object = srat, vars = "ENSG00000081237", layer = "counts"),
VWF = FetchData(object = srat, vars = "ENSG00000110799", layer = "counts"),
VIM = FetchData(object = srat, vars = "ENSG00000026025", layer = "counts"),
CITED1 = FetchData(object = srat, vars = "ENSG00000125931", layer = "counts"),
CDH1 = FetchData(object = srat, vars = "ENSG00000039068", layer = "counts"),
PODXL = FetchData(object = srat, vars = "ENSG00000128567", layer = "counts"),
COL6A3 = FetchData(object = srat, vars = "ENSG00000163359", layer = "counts"),
SIX2 = FetchData(object = srat, vars = "ENSG00000170577", layer = "counts"),
NCAM1 = FetchData(object = srat, vars = "ENSG00000149294", layer = "counts"),
THY1 = FetchData(object = srat, vars = "ENSG00000154096", layer = "counts"),
# loss global estimation per chromosome
has_loss_chr1p = srat$has_loss_chr1p,
has_loss_chr2p = srat$has_loss_chr2p,
has_loss_chr3p = srat$has_loss_chr3p,
has_loss_chr4p = srat$has_loss_chr4p,
has_loss_chr5p = srat$has_loss_chr5p,
has_loss_chr6p = srat$has_loss_chr6p,
has_loss_chr7p = srat$has_loss_chr7p,
has_loss_chr8p = srat$has_loss_chr8p,
has_loss_chr9p = srat$has_loss_chr9p,
has_loss_chr10p = srat$has_loss_chr10p,
has_loss_chr11p = srat$has_loss_chr11p,
has_loss_chr12p = srat$has_loss_chr12p,
has_loss_chr16p = srat$has_loss_chr16p,
has_loss_chr17p = srat$has_loss_chr17p,
has_loss_chr18p = srat$has_loss_chr18p,
has_loss_chr19p = srat$has_loss_chr19p,
has_loss_chr20p = srat$has_loss_chr20p,
has_loss_chr1q = srat$has_loss_chr1q,
has_loss_chr2q = srat$has_loss_chr2q,
has_loss_chr3q = srat$has_loss_chr3q,
has_loss_chr4q = srat$has_loss_chr4q,
has_loss_chr5q = srat$has_loss_chr5q,
has_loss_chr6q = srat$has_loss_chr6q,
has_loss_chr7q = srat$has_loss_chr7q,
has_loss_chr8q = srat$has_loss_chr8q,
has_loss_chr9q = srat$has_loss_chr9q,
has_loss_chr10q = srat$has_loss_chr10q,
has_loss_chr11q = srat$has_loss_chr11q,
has_loss_chr12q = srat$has_loss_chr12q,
has_loss_chr13q = srat$has_loss_chr13q,
has_loss_chr14q = srat$has_loss_chr14q,
has_loss_chr15q = srat$has_loss_chr15q,
has_loss_chr16q = srat$has_loss_chr16q,
has_loss_chr17q = srat$has_loss_chr17q,
has_loss_chr18q = srat$has_loss_chr18q,
has_loss_chr19q = srat$has_loss_chr19q,
has_loss_chr20q = srat$has_loss_chr20q,
has_loss_chr21q = srat$has_loss_chr21q,
has_loss_chr22q = srat$has_loss_chr22q,
# dupli global estimation per chromosome
has_dupli_chr1p = srat$has_dupli_chr1p,
has_dupli_chr2p = srat$has_dupli_chr2p,
has_dupli_chr3p = srat$has_dupli_chr3p,
has_dupli_chr4p = srat$has_dupli_chr4p,
has_dupli_chr5p = srat$has_dupli_chr5p,
has_dupli_chr6p = srat$has_dupli_chr6p,
has_dupli_chr7p = srat$has_dupli_chr7p,
has_dupli_chr8p = srat$has_dupli_chr8p,
has_dupli_chr9p = srat$has_dupli_chr9p,
has_dupli_chr10p = srat$has_dupli_chr10p,
has_dupli_chr11p = srat$has_dupli_chr11p,
has_dupli_chr12p = srat$has_dupli_chr12p,
has_dupli_chr16p = srat$has_dupli_chr16p,
has_dupli_chr17p = srat$has_dupli_chr17p,
has_dupli_chr18p = srat$has_dupli_chr18p,
has_dupli_chr19p = srat$has_dupli_chr19p,
has_dupli_chr20p = srat$has_dupli_chr20p,
has_dupli_chr1q = srat$has_dupli_chr1q,
has_dupli_chr2q = srat$has_dupli_chr2q,
has_dupli_chr3q = srat$has_dupli_chr3q,
has_dupli_chr4q = srat$has_dupli_chr4q,
has_dupli_chr5q = srat$has_dupli_chr5q,
has_dupli_chr6q = srat$has_dupli_chr6q,
has_dupli_chr7q = srat$has_dupli_chr7q,
has_dupli_chr8q = srat$has_dupli_chr8q,
has_dupli_chr9q = srat$has_dupli_chr9q,
has_dupli_chr10q = srat$has_dupli_chr10q,
has_dupli_chr11q = srat$has_dupli_chr11q,
has_dupli_chr12q = srat$has_dupli_chr12q,
has_dupli_chr13q = srat$has_dupli_chr13q,
has_dupli_chr14q = srat$has_dupli_chr14q,
has_dupli_chr15q = srat$has_dupli_chr15q,
has_dupli_chr16q = srat$has_dupli_chr16q,
has_dupli_chr17q = srat$has_dupli_chr17q,
has_dupli_chr18q = srat$has_dupli_chr18q,
has_dupli_chr19q = srat$has_dupli_chr19q,
has_dupli_chr20q = srat$has_dupli_chr20q,
has_dupli_chr21q = srat$has_dupli_chr21q,
has_dupli_chr22q = srat$has_dupli_chr22q
) |>
tibble::rownames_to_column("barcode") |>
dplyr::mutate(sample_id = sample_id)
}
) |>
# now combine all dataframes to make one big one
dplyr::bind_rows()do_Feature_mean shows heatmap of mean expression of a
feature grouped by a metadata.
df is the name of the table containing metadata and
feature expression (counts) per cellsgroup.by is the name of the metadata to group the
cellsfeature is the name of the gene to average the
expressiondo_Feature_mean <- function(df, group.by, feature) {
df <- df |>
group_by(sample_id, !!sym(group.by)) |>
summarise(m = mean(!!sym(feature)))
p <- ggplot(df, aes(x = sample_id, y = !!sym(group.by), fill = m)) +
geom_tile() +
scale_fill_viridis_c() +
theme_bw() +
theme(text = element_text(size = 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) +
guides(fill = guide_colourbar(title = paste0(feature)))
return(p)
}do_BoxPlot_mean shows boxplot of the percentage of cells
with CNV in a specific chromosome arm per clinical relevant group
(metadata)
df is the name of the table containing metadata and
feature expression (counts) per cellsgroup.by is the name of the metadata to group the
cellscomparison is a list of factors in
group.by for which we will compare the means using an
unpaired Wilcoxon testdo_BoxPlot_mean <- function(df, group.by, comparison){
tmp <- df |>
select(!!sym(group.by), sample_id, contains("has_loss_chr"), contains("has_dupli_chr")) |>
group_by(!!sym(group.by), sample_id) |>
summarise_all(.funs = mean) |>
pivot_longer(cols = -c(sample_id,!!sym(group.by)) ) |>
rename("chromosome_arm" = "name", "percentage" = "value") |>
mutate(chromosome_arm = factor(chromosome_arm, levels = c(has_dupli_chr, has_loss_chr)))
p <- ggplot(tmp, aes(x = !!sym(group.by), y = percentage, fill = !!sym(group.by))) +
geom_boxplot()+
scale_fill_brewer(palette="PuRd") +
geom_dotplot(binaxis='y', stackdir='center') +
facet_wrap(~chromosome_arm, nrow = 5 ) +
theme_classic()
return(p)
}As done in 06_cnv_infercnv_exploration.Rmd, we calculate
single CNV score and assess its potential in identifying cells with CNV
versus normal cells without CNV.
We simply checked for each chromosome if the cell
has_cnv_chr. Would the cell have less than 0 chromosome
with CNV, the global has_cnv_score will be FALSE. Would the
cell have more than 2 chromosome with CNV, the global
has_cnv_score will be TRUE. The default is set to
unknown.
The infercnv method can lead to false positive CNV. We
introduced a flexibility of +2 over the cnv_threshold to
reduce the risk of misclassification of normal cells as cancer
cells.
Please note that some cancer cell might not have any CNV. There is thus a risk of misclassifying cancer cells as normal cells. However, we cannot avoid this risk with the data and tools currently available. This is a limitation of our annotations.
cell_type_df <- cell_type_df |>
mutate(has_cnv_score = rowSums(cell_type_df[, grepl("has_loss_chr|has_dupli_chr", colnames(cell_type_df))])) |>
mutate(has_cnv_score = case_when(
has_cnv_score > params$cnv_threshold_high ~ "CNV",
has_cnv_score <= params$cnv_threshold_low ~ "no CNV",
.default = "unknown"
))At first, we like to indicate in the
first.level_annotation if a cell is normal, cancer or
unknown.
normal,endothelium, immune,
stroma or fetal nephron) and do not have CNV.
We only allow a bit of flexibility in terms of CNV profile for immune
and endothelium cells that have a high predicted score. Indeed, we know
that false positive CNV can be observed in a cell type specific manner.
Please note that in 06_infercnv.R, we renamed all
immune and endothelium cells with a
fetal_kidney_predicted.compartment.score > 0.75 as
normal.The threshold used for the predicted.score is defined as
a parameter of this notebook as 0.75. The thresholds used for the
identification of CNV are also defined as notebook parameters with a
minimum of 0 and a maximum of 2.
stroma or
fetal nephron compartments and must have at least few
CNV# Define normal cells
# We first pick up the immune and endothelial cells annotated via the label transfer compartments under the condition that the predicted score is above the threshold
cell_type_df <- cell_type_df |>
mutate(first.level_annotation = case_when(
# assign normal/cancer based on condition
compartment %in% c("fetal_nephron", "stroma") &
has_cnv_score == "no CNV" ~ "normal",
compartment %in% c("normal", "immune", "endothelium") &
cell_type.score > params$predicted.celltype.threshold ~ "normal",
compartment %in% c("fetal_nephron", "stroma") &
has_cnv_score == "CNV" ~ "cancer",
.default = "unknown"
))Using this basic strategy, we identified 156207 cancer cells, 12777 normal cells. 31553cells remain unknown
ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = first.level_annotation), shape = 19, size = 1) +
geom_point() +
facet_wrap(facets = ~sample_id, ncol = 5) +
theme_bw() +
theme(text = element_text(size = 22))Strikingly, 5 samples show more normal cells than cancer cells:
SCPCS000177SCPCS000180SCPCS000181SCPCS000190SCPCS000197These five samples do not have enough immune and endothelium cell to
be used as a normal reference in scripts/06_infercnv.R and
we previously decided to run scripts/06_infercnv.R without
any reference for these samples. In that case, the mean expression
profile over the sample is taken as the normal reference, biaising the
inference of CNV.
For those samples, the annotations based on infercnv
cannot be trust. To avoid any confusion in the following steps of the
analysis, we decided to force the annotation of the entire samples to
unknown.
Normal cells from the fetal nephron compartment must
be normal kidney cells.
Normal cells from the stroma compartment must be
normal stroma cells.
Immune and endothelial cells have been already identified by label transfer.
Wilms tumor cancer cells can be:
cancer stroma: We define as cancer stroma all cancer cells from the stroma compartment.
blastema,: we defined as blastema every cancer
cell that has a
fetal_kidney_predicted.cell_type == mesenchymal cell. We
know that these mesenchymal cells are cells from the cap
mesenchyme that are not expected to be in a mature kidney. These
blastema cells should express higher CITED1 and/or
NCAM1.
cancer epithelium: we defined as cancer epithelium all cancer cells that are neither stroma nor blastemal cells. We expect these cells to express epithelial markers. Their predicted cell type should correspond to more mature kidney epithelial subunits.
cell_type_df <- cell_type_df |>
mutate(second.level_annotation = case_when(
# assign normal cells based on condition
cell_type_df$compartment %in% c("fetal_nephron") &
cell_type_df$has_cnv_score == "no CNV" ~ "kidney",
cell_type_df$compartment %in% c("stroma") &
cell_type_df$has_cnv_score == "no CNV" ~ "normal stroma",
cell_type_df$compartment %in% c("normal") &
cell_type_df$cell_type %in% c("endothelial cell") ~ "endothelium",
cell_type_df$compartment %in% c("normal") &
cell_type_df$cell_type %in% c("B cell",
"CD4-positive, alpha-beta T cell",
"conventional dendritic cell",
"lymphocyte",
"macrophage",
"mast cell",
"monocyte",
"natural killer cell",
"neutrophil",
"plasmacytoid dendritic cell") ~ "immune",
# assign cancer cells based on condition
cell_type_df$compartment %in% c("stroma") &
cell_type_df$has_cnv_score == "CNV" ~ "cancer stroma",
cell_type_df$compartment %in% c("fetal_nephron") &
cell_type_df$has_cnv_score == "CNV" &
cell_type_df$cell_type == "mesenchymal cell" ~ "blastema",
cell_type_df$compartment %in% c("fetal_nephron") &
cell_type_df$has_cnv_score == "CNV" &
cell_type_df$cell_type != "mesenchymal cell" ~ "cancer epithelium",
.default = "unknown"
))
cell_type_df$second.level_annotation[cell_type_df$sample_id %in% c("SCPCS000177", "SCPCS000180", "SCPCS000181", "SCPCS000190", "SCPCS000197")] <- "unknown"Second.level_annotation of cancer and normal cells -
umap reductionggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation)) +
geom_point(size = 1, shape = 19) +
facet_wrap(facets = ~sample_id, ncol = 5) +
theme_bw() +
theme(text = element_text(size = 22))Second.level_annotation of cancer and normal cells
without unknown - umap reductioncell_type_df_sub <- cell_type_df[cell_type_df$second.level_annotation != "unknown",]
ggplot(cell_type_df_sub, aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation)) +
geom_point(size = 1, shape = 19) +
facet_wrap(facets = ~sample_id, ncol = 5) +
theme_bw() +
theme(text = element_text(size = 22))Per sample, the annotations in the umap reduction seem
to make sense as we can the three main Wilms tumor components are easily
distinguished:
We look for each second.level_annotation and
sample_id at the proportion of cells that has CNV in each
of the chromosome.
These heatmaps allow us to:
validate that mostly tumor cells do present high percentages of cells with CNV (by definition)
identify chromosome and cell types that might be more sensitive to FALSE positive CNV, such as the chr6p which is a hub for immune genes
get the repartition per sample of the CNV
for (i in 1:22) {
print(do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = glue::glue("has_loss_chr", i, "q")))
}for (i in 1:22) {
print(do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = glue::glue("has_dupli_chr", i, "q")))
}While the above heatmaps can be informative, we have to acknowledge that there are numerous and it is difficult to get a global picture of the CNV profile of the entire dataset.
Here we aim to plot a mean CNV profile across the 35 annotated samples as presented in Figure 1A of Cresswell et al, BioRXiv (2024). This represent the mean SNP Array profile of 64 pediatric kidney cancer (including 60 Wilms tumors).
Here we plot for each chromosome arm (x-axis) the number of samples
having a dupli (y-axis > 0, in red) or a
loss (y-axis < 0, in blue).
For a sample, a CNV is defined as clonal (opaque) if
detected in more than 70% of the cells; and subclonal
(transparent) when detected in 20 to 70% of the cells.
# Here we define the vector of ordered chromosomes arms that will be used for plotting
has_dupli_chr <- c("has_dupli_chr1p",
"has_dupli_chr1q",
"has_dupli_chr2p",
"has_dupli_chr2q",
"has_dupli_chr3p",
"has_dupli_chr3q",
"has_dupli_chr4p",
"has_dupli_chr4q",
"has_dupli_chr5p",
"has_dupli_chr5q",
"has_dupli_chr6p",
"has_dupli_chr6q",
"has_dupli_chr7p",
"has_dupli_chr7q",
"has_dupli_chr8p",
"has_dupli_chr8q",
"has_dupli_chr9p",
"has_dupli_chr9q",
"has_dupli_chr10p",
"has_dupli_chr10q",
"has_dupli_chr11p",
"has_dupli_chr11q",
"has_dupli_chr12p",
"has_dupli_chr12q",
"has_dupli_chr13p",
"has_dupli_chr13q",
"has_dupli_chr14p",
"has_dupli_chr14q",
"has_dupli_chr15p",
"has_dupli_chr15q",
"has_dupli_chr16p",
"has_dupli_chr16q",
"has_dupli_chr17p",
"has_dupli_chr17q",
"has_dupli_chr18p",
"has_dupli_chr18q",
"has_dupli_chr19p",
"has_dupli_chr19q",
"has_dupli_chr20p",
"has_dupli_chr20q",
"has_dupli_chr21p",
"has_dupli_chr21q",
"has_dupli_chr22p",
"has_dupli_chr22q"
)
#################################################################################
# define the vector of ordered chromosomes arms##################################
has_loss_chr <- c("has_loss_chr1p",
"has_loss_chr1q",
"has_loss_chr2p",
"has_loss_chr2q",
"has_loss_chr3p",
"has_loss_chr3q",
"has_loss_chr4p",
"has_loss_chr4q",
"has_loss_chr5p",
"has_loss_chr5q",
"has_loss_chr6p",
"has_loss_chr6q",
"has_loss_chr7p",
"has_loss_chr7q",
"has_loss_chr8p",
"has_loss_chr8q",
"has_loss_chr9p",
"has_loss_chr9q",
"has_loss_chr10p",
"has_loss_chr10q",
"has_loss_chr11p",
"has_loss_chr11q",
"has_loss_chr12p",
"has_loss_chr12q",
"has_loss_chr13p",
"has_loss_chr13q",
"has_loss_chr14p",
"has_loss_chr14q",
"has_loss_chr15p",
"has_loss_chr15q",
"has_loss_chr16p",
"has_loss_chr16q",
"has_loss_chr17p",
"has_loss_chr17q",
"has_loss_chr18p",
"has_loss_chr18q",
"has_loss_chr19p",
"has_loss_chr19q",
"has_loss_chr20p",
"has_loss_chr20q",
"has_loss_chr21p",
"has_loss_chr21q",
"has_loss_chr22p",
"has_loss_chr22q"
)
#################################################################################
# Here we define the size of each chromosome arm that will be used for plotting of the mean CNV profile
# Load cytoBand file
cytoBand <- read.table(gzfile(arm_order_file), header = FALSE, sep = "\t", stringsAsFactors = FALSE)
colnames(cytoBand) <- c("chrom", "start", "end", "band", "stain")
# Extract chromosome arm information
cytoBand <- cytoBand |>
mutate(arm = substr(band, 1, 1)) # Extract 'p' or 'q' from the band column
# Define arm boundaries
chromosome_arms <- cytoBand |>
group_by(chrom, arm) |>
summarize(
Start = min(start),
End = max(end),
.groups = "drop"
) |>
mutate(Size = End - Start,
dupli = paste0("has_dupli_", chrom, arm),
loss = paste0("has_loss_", chrom, arm)) |>
filter(arm != "")
chromosome_arms <- chromosome_arms |>
mutate(Size = Size / sum(Size))# prepare the table for the number of sample with a (sub)clonal loss
CNV_profile_loss <- cell_type_df_sub |>
select(sample_id, contains("has_loss_chr")) |>
group_by(sample_id) |>
summarise_all(.funs = mean) |>
pivot_longer(cols = -sample_id) |>
rename("chromosome_arm" = "name", "percentage" = "value") |>
left_join( chromosome_arms, by = c("chromosome_arm"="loss")) |>
mutate(chromosome_arm = factor(chromosome_arm, levels = has_loss_chr)) |>
mutate(absolute_70 = case_when((percentage > 0.7) ~ -1,
.default = 0),
absolute_10 = case_when((percentage > 0.1) ~ -1,
.default = 0))
# prepare the table for the number of sample with a (sub)clonal gain
CNV_profile_gain <- cell_type_df_sub |>
select(sample_id, contains("has_dupli_chr")) |>
group_by(sample_id) |>
summarise_all(.funs = mean) |>
pivot_longer(cols = -sample_id) |>
rename("chromosome_arm" = "name", "percentage" = "value") |>
left_join( chromosome_arms, by = c("chromosome_arm"="dupli")) |>
mutate(chromosome_arm = factor(chromosome_arm, levels = has_dupli_chr)) |>
mutate(absolute_70 = case_when((percentage > 0.7) ~ 1,
.default = 0),
absolute_10 = case_when((percentage > 0.1) ~ 1,
.default = 0))
# plot
loss_profile <- ggplot(CNV_profile_loss, aes(x = chromosome_arm, y = absolute_10, width = 20 * Size)) +
geom_bar(stat = "identity", fill = "#41b6c4", alpha = 0.4) +
geom_bar(aes(x = chromosome_arm, y = absolute_70, width = 20 * Size), stat = "identity", fill = "#41b6c4") +
scale_x_discrete(labels = gsub("has_loss_", "", levels(droplevels(CNV_profile_loss$chromosome_arm)))) +
theme_classic() +
ylab("Number of occurrence") +
xlab("chromosom arms") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 12), text = element_text(size = 15))## Warning in geom_bar(aes(x = chromosome_arm, y = absolute_70, width = 20 * :
## Ignoring unknown aesthetics: width
gain_profile <- ggplot(CNV_profile_gain, aes(chromosome_arm, y = absolute_10, width = 20* Size)) +
geom_bar(stat = "identity", fill = "#ce1256", alpha = 0.4) +
geom_bar(aes(x = chromosome_arm, y = absolute_70, width = 20 * Size), stat = "identity", fill = "#ce1256") +
theme_classic() +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
text = element_text(size = 15)) +
ylab(" ") ## Warning in geom_bar(aes(x = chromosome_arm, y = absolute_70, width = 20 * :
## Ignoring unknown aesthetics: width
Comparing with the profile in Cresswell et al, BioRXiv (2024), we identified important CNV known to play a role in Wilms tumor, including:
chr8q, chr12, chr13, chr18q gains
chr11q, chr16, chr17q loss
However, we spotted some differences:
we couldn’t detect any chr4q loss as described in Cresswell et al, BioRXiv (2024).
we identified 2 losses in chr5 and chr6 that do
not show up in Cresswell
et al, BioRXiv (2024). These can be infercnv induced
_FALSE_positive CNV.
Our major concern is the pattern of the chr1. We do see a
tendency of chr1p loss and chr1q gain but it is not as
pronounced as in Cresswell
et al, BioRXiv (2024). chr1q gain is however one of the
most promising predictive marker of Wilms tumor outcome (Nelson et al,
Curr Opin Pediatr (2020)). It is however important to notice that
the profile shown in Cresswell
et al, BioRXiv (2024) derived from SIOP samples that have been
pre-treated with chemotherapy, while samples in the dataset are enriched
in upfront resection samples that haven’t been pre-treated,
according to the COG protocol. So far, we do not know how the
pre-operative chemotherapy impact the clonal selection of specific
CNV.
vital_statusAs described in Nelson et al, Curr Opin Pediatr (2020), Phelps et al, Children (2018) and Zheng et al, Front. Oncol (2023), LOH at chr1p, chr11p and chr16q as well as chr1q gain are prognostic factors.
Here we aim to see if we identify CNV associated with poorer prognosis.
do_BoxPlot_mean(df = cell_type_df_sub, group.by = "vital_status", comparison = c("Alive", "Expired") )In our study, we validate the association of chr1q gain, chr11q loss and chr14q loss with poorer prognosis.
treatmentAs we don’t know the effect of the treatment on the selection of
cells with CNV, it is difficult to compare our CNV profile with the one
in Cresswell
et al, BioRXiv (2024). We those check the repartition of CNV in
samples that have been pre-treated (i.e
Resection post chemotherapy) with samples that have been
removed surgically before any treatment (i.e
Upfront resection).
do_BoxPlot_mean(df = cell_type_df_sub, group.by = "treatment", comparison = c("Upfront resection", "Resection post chemotherapy") )Our analysis suggest that there is indeed an enrichment in cells with CNV after chemotherapy, including a tendency of more chr1q gain samples after chemotherapy. This could explain some of the differences observed between our mean CNV profile and the one published in Cresswell et al, BioRXiv (2024).
Finally, we aim to validate the second level annotation
using marker genes.
PTPRC expressiondo_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000081237")Immune cells are well annotated based on PTPRC expression.
VWF expressiondo_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000110799")Endothelial cells are well annotated based on VWF expression.
(*) To the best of our knowledges, there is no Wilms tumor specific marker of stroma cells. They should express in some extend similar markers as normal stromal cells including VIM, THY1 and COL6A3.
Vimentin expressiondo_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000026025")VIM do not seem to be neither specific nor universal.
(*) To the best of our knowledges, there is no Wilms tumor specific and universel (i.e. expressed by all cells in every sample) marker of blastema cells. We expect however blastema cells to express higher levels of stemness markers such as CITED1, SIX1/2, NCAM1.
CITED1 expressiondo_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000125931")Blastema cells are enriched in CITED1+ cells, but some samples do not express it at all.
This section creates the cell type annotation table for export.
annotations_table <- cell_type_df |>
dplyr::select(
cell_barcode = barcode,
scpca_sample_id = sample_id,
tumor_cell_classification = first.level_annotation,
cell_type_assignment = second.level_annotation
) |>
mutate(
# change cancer --> tumor, but keep the other labels
tumor_cell_classification = ifelse(
tumor_cell_classification == "cancer", "tumor", tumor_cell_classification
),
cell_type_assignment = str_replace_all(
cell_type_assignment,
"cancer ",
"tumor "
)
)
write_tsv(annotations_table, annotations_tsv)Confirm how many samples we have annotations for:
## [1] 40
Combining label transfer and CNV inference we have produced draft
annotations for 35/40 Wilms tumor samples in SCPCP000006. We decided not
to annotate samples for which the infercnv couldn’t be run
with a normal reference, as we don’t trust the results in that case and
want to avoid misclassification (SCPCS000177,
SCPCS000180, SCPCS000181,
SCPCS000190, SCPCS000197).
The mean CNV profile of the Wilms tumor cohort seems reasonable
in regards to the one published in Cresswell
et al, BioRXiv (2024). We identify some commonalities but also some
divergence. They can be due to technical limitation of our CNV inference
but can also reflect some specificities of this cohort (e.g. pre/post
treatment samples). It would be really useful to compare for some sample
the infercnv result to a ground truth such as SNP Array (to
be requested).
The heatmaps of CNV proportion and marker genes support our annotations, but signals with some marker genes are very low. Also, there is no universal marker for each entity of Wilms tumor that cover all tumor cells from all patient. This makes the validation of the annotations quite difficult.
However, we could try to take the problem from the other side, and used the current annotation to perform differential expression analysis and try to find marker genes that are consistent across patient and Wilms tumor histologies.
In each histology (i.e. epithelial and stroma), the distinction between cancer and non cancer cell is difficult (as expected). In this analysis, we suggested to rely on the CNV score to assess the normality of the cell. Here again, we could try to run differential expression analysis and compare epithelial (resp. stroma) cancer versus non-cancer cells across patient, aiming to find a share transcriptional program allowing the classification cancer versus normal.
In our annotation, we haven’t taken into account the favorable/anaplastic status of the sample. However, as anaplasia can occur in every (but do not has to) Wilms tumor histology, I am not sure how to integrate the information into the annotation.
This notebook could be finally rendered using different parameters, i.e. threshold for the CNV score and predicted score to use.
# record the versions of the packages used in this analysis and other environment information
sessionInfo()## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Vienna
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] DT_0.33 patchwork_1.2.0 lubridate_1.9.3 forcats_1.0.0
## [5] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [9] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
## [13] Seurat_5.1.0 SeuratObject_5.0.2 sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.16.0 jsonlite_1.8.8
## [4] magrittr_2.0.3 spatstat.utils_3.1-0 farver_2.1.2
## [7] rmarkdown_2.28 vctrs_0.6.5 ROCR_1.0-11
## [10] spatstat.explore_3.3-2 htmltools_0.5.8.1 sass_0.4.9
## [13] sctransform_0.4.1 parallelly_1.38.0 KernSmooth_2.23-24
## [16] bslib_0.8.0 htmlwidgets_1.6.4 ica_1.0-3
## [19] plyr_1.8.9 plotly_4.10.4 zoo_1.8-12
## [22] cachem_1.1.0 igraph_2.0.3 mime_0.12
## [25] lifecycle_1.0.4 pkgconfig_2.0.3 Matrix_1.7-0
## [28] R6_2.5.1 fastmap_1.2.0 fitdistrplus_1.2-1
## [31] future_1.34.0 shiny_1.9.1 digest_0.6.37
## [34] colorspace_2.1-1 rprojroot_2.0.4 tensor_1.5
## [37] RSpectra_0.16-2 irlba_2.3.5.1 crosstalk_1.2.1
## [40] labeling_0.4.3 progressr_0.14.0 fansi_1.0.6
## [43] spatstat.sparse_3.1-0 timechange_0.3.0 httr_1.4.7
## [46] polyclip_1.10-7 abind_1.4-5 compiler_4.4.1
## [49] bit64_4.0.5 withr_3.0.1 fastDummies_1.7.4
## [52] highr_0.11 MASS_7.3-61 tools_4.4.1
## [55] lmtest_0.9-40 httpuv_1.6.15 future.apply_1.11.2
## [58] goftest_1.2-3 glue_1.7.0 nlme_3.1-166
## [61] promises_1.3.0 grid_4.4.1 Rtsne_0.17
## [64] cluster_2.1.6 reshape2_1.4.4 generics_0.1.3
## [67] gtable_0.3.5 spatstat.data_3.1-2 tzdb_0.4.0
## [70] data.table_1.16.0 hms_1.1.3 utf8_1.2.4
## [73] spatstat.geom_3.3-2 RcppAnnoy_0.0.22 ggrepel_0.9.5
## [76] RANN_2.6.2 pillar_1.9.0 vroom_1.6.5
## [79] spam_2.10-0 RcppHNSW_0.6.0 later_1.3.2
## [82] splines_4.4.1 lattice_0.22-6 bit_4.0.5
## [85] survival_3.7-0 deldir_2.0-4 tidyselect_1.2.1
## [88] miniUI_0.1.1.1 pbapply_1.7-2 knitr_1.48
## [91] gridExtra_2.3 scattermore_1.2 xfun_0.47
## [94] matrixStats_1.3.0 stringi_1.8.4 lazyeval_0.2.2
## [97] yaml_2.3.10 evaluate_0.24.0 codetools_0.2-20
## [100] cli_3.6.3 uwot_0.2.2 xtable_1.8-4
## [103] reticulate_1.38.0 munsell_0.5.1 jquerylib_0.1.4
## [106] Rcpp_1.0.14 globals_0.16.3 spatstat.random_3.3-1
## [109] png_0.1-8 spatstat.univar_3.0-0 parallel_4.4.1
## [112] dotCall64_1.1-1 listenv_0.9.1 viridisLite_0.4.2
## [115] scales_1.3.0 ggridges_0.5.6 crayon_1.5.3
## [118] leiden_0.4.3.1 rlang_1.1.4 cowplot_1.1.3